HW 1 Anderson

Author

Mitchell Anderson

#Question 1
library(readr)
# Load the 2002 and 2022 PM2.5 CSV files into the global environment.
# The paths are built with file.path() rather than by calling setwd(), so the
# session's working directory is left untouched. To run this on another
# machine, change `data_dir` to the folder that holds the two CSV files.
# read.csv() is used deliberately: it turns the CSV headers into syntactic
# names (e.g. Daily.Mean.PM2.5.Concentration), which the rest of the analysis
# refers to.
data_dir <- "/Users/mitchellanderson/Desktop/usc.health.dat.sci"
PM2022 <- read.csv(file.path(data_dir, "PM_data_2022.csv"))
PM2002 <- read.csv(file.path(data_dir, "PM_data_2002.csv"))
#EDA
#check the dimensions, headers, footers, variable names and variable types. Check the distribution of the key variable we are analyzing (PM2.5)
# Variable names of the 2022 data
names(PM2022)
 [1] "Date"                           "Source"                        
 [3] "Site.ID"                        "POC"                           
 [5] "Daily.Mean.PM2.5.Concentration" "Units"                         
 [7] "Daily.AQI.Value"                "Local.Site.Name"               
 [9] "Daily.Obs.Count"                "Percent.Complete"              
[11] "AQS.Parameter.Code"             "AQS.Parameter.Description"     
[13] "Method.Code"                    "Method.Description"            
[15] "CBSA.Code"                      "CBSA.Name"                     
[17] "State.FIPS.Code"                "State"                         
[19] "County.FIPS.Code"               "County"                        
[21] "Site.Latitude"                  "Site.Longitude"                
# Variable names of the 2002 data
names(PM2002)
 [1] "Date"                           "Source"                        
 [3] "Site.ID"                        "POC"                           
 [5] "Daily.Mean.PM2.5.Concentration" "Units"                         
 [7] "Daily.AQI.Value"                "Local.Site.Name"               
 [9] "Daily.Obs.Count"                "Percent.Complete"              
[11] "AQS.Parameter.Code"             "AQS.Parameter.Description"     
[13] "Method.Code"                    "Method.Description"            
[15] "CBSA.Code"                      "CBSA.Name"                     
[17] "State.FIPS.Code"                "State"                         
[19] "County.FIPS.Code"               "County"                        
[21] "Site.Latitude"                  "Site.Longitude"                
# Number of rows and columns in the 2022 data
c(nrow(PM2022), ncol(PM2022))
[1] 59918    22
# Number of rows and columns in the 2002 data
c(nrow(PM2002), ncol(PM2002))
[1] 15976    22
# First six rows of the 2022 data
head(PM2022, n = 6L)
        Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
1 01/01/2022    AQS 60010007   3                           12.7 ug/m3 LC
2 01/02/2022    AQS 60010007   3                           13.9 ug/m3 LC
3 01/03/2022    AQS 60010007   3                            7.1 ug/m3 LC
4 01/04/2022    AQS 60010007   3                            3.7 ug/m3 LC
5 01/05/2022    AQS 60010007   3                            4.2 ug/m3 LC
6 01/06/2022    AQS 60010007   3                            3.8 ug/m3 LC
  Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
1              58       Livermore               1              100
2              60       Livermore               1              100
3              39       Livermore               1              100
4              21       Livermore               1              100
5              23       Livermore               1              100
6              21       Livermore               1              100
  AQS.Parameter.Code AQS.Parameter.Description Method.Code
1              88101  PM2.5 - Local Conditions         170
2              88101  PM2.5 - Local Conditions         170
3              88101  PM2.5 - Local Conditions         170
4              88101  PM2.5 - Local Conditions         170
5              88101  PM2.5 - Local Conditions         170
6              88101  PM2.5 - Local Conditions         170
                    Method.Description CBSA.Code
1 Met One BAM-1020 Mass Monitor w/VSCC     41860
2 Met One BAM-1020 Mass Monitor w/VSCC     41860
3 Met One BAM-1020 Mass Monitor w/VSCC     41860
4 Met One BAM-1020 Mass Monitor w/VSCC     41860
5 Met One BAM-1020 Mass Monitor w/VSCC     41860
6 Met One BAM-1020 Mass Monitor w/VSCC     41860
                          CBSA.Name State.FIPS.Code      State County.FIPS.Code
1 San Francisco-Oakland-Hayward, CA               6 California                1
2 San Francisco-Oakland-Hayward, CA               6 California                1
3 San Francisco-Oakland-Hayward, CA               6 California                1
4 San Francisco-Oakland-Hayward, CA               6 California                1
5 San Francisco-Oakland-Hayward, CA               6 California                1
6 San Francisco-Oakland-Hayward, CA               6 California                1
   County Site.Latitude Site.Longitude
1 Alameda      37.68753      -121.7842
2 Alameda      37.68753      -121.7842
3 Alameda      37.68753      -121.7842
4 Alameda      37.68753      -121.7842
5 Alameda      37.68753      -121.7842
6 Alameda      37.68753      -121.7842
# Last six rows of the 2022 data
tail(PM2022, n = 6L)
            Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
59913 12/01/2022    AQS 61131003   1                            3.4 ug/m3 LC
59914 12/07/2022    AQS 61131003   1                            3.8 ug/m3 LC
59915 12/13/2022    AQS 61131003   1                            6.0 ug/m3 LC
59916 12/19/2022    AQS 61131003   1                           34.8 ug/m3 LC
59917 12/25/2022    AQS 61131003   1                           23.2 ug/m3 LC
59918 12/31/2022    AQS 61131003   1                            1.0 ug/m3 LC
      Daily.AQI.Value      Local.Site.Name Daily.Obs.Count Percent.Complete
59913              19 Woodland-Gibson Road               1              100
59914              21 Woodland-Gibson Road               1              100
59915              33 Woodland-Gibson Road               1              100
59916              99 Woodland-Gibson Road               1              100
59917              77 Woodland-Gibson Road               1              100
59918               6 Woodland-Gibson Road               1              100
      AQS.Parameter.Code AQS.Parameter.Description Method.Code
59913              88101  PM2.5 - Local Conditions         145
59914              88101  PM2.5 - Local Conditions         145
59915              88101  PM2.5 - Local Conditions         145
59916              88101  PM2.5 - Local Conditions         145
59917              88101  PM2.5 - Local Conditions         145
59918              88101  PM2.5 - Local Conditions         145
                                         Method.Description CBSA.Code
59913 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59914 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59915 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59916 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59917 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
59918 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC     40900
                                    CBSA.Name State.FIPS.Code      State
59913 Sacramento--Roseville--Arden-Arcade, CA               6 California
59914 Sacramento--Roseville--Arden-Arcade, CA               6 California
59915 Sacramento--Roseville--Arden-Arcade, CA               6 California
59916 Sacramento--Roseville--Arden-Arcade, CA               6 California
59917 Sacramento--Roseville--Arden-Arcade, CA               6 California
59918 Sacramento--Roseville--Arden-Arcade, CA               6 California
      County.FIPS.Code County Site.Latitude Site.Longitude
59913              113   Yolo      38.66121      -121.7327
59914              113   Yolo      38.66121      -121.7327
59915              113   Yolo      38.66121      -121.7327
59916              113   Yolo      38.66121      -121.7327
59917              113   Yolo      38.66121      -121.7327
59918              113   Yolo      38.66121      -121.7327
# First six rows of the 2002 data
head(PM2002, n = 6L)
        Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
1 01/05/2002    AQS 60010007   1                           25.1 ug/m3 LC
2 01/06/2002    AQS 60010007   1                           31.6 ug/m3 LC
3 01/08/2002    AQS 60010007   1                           21.4 ug/m3 LC
4 01/11/2002    AQS 60010007   1                           25.9 ug/m3 LC
5 01/14/2002    AQS 60010007   1                           34.5 ug/m3 LC
6 01/17/2002    AQS 60010007   1                           41.0 ug/m3 LC
  Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
1              81       Livermore               1              100
2              93       Livermore               1              100
3              74       Livermore               1              100
4              82       Livermore               1              100
5              98       Livermore               1              100
6             115       Livermore               1              100
  AQS.Parameter.Code AQS.Parameter.Description Method.Code
1              88101  PM2.5 - Local Conditions         120
2              88101  PM2.5 - Local Conditions         120
3              88101  PM2.5 - Local Conditions         120
4              88101  PM2.5 - Local Conditions         120
5              88101  PM2.5 - Local Conditions         120
6              88101  PM2.5 - Local Conditions         120
                     Method.Description CBSA.Code
1 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
2 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
3 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
4 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
5 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
6 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
                          CBSA.Name State.FIPS.Code      State County.FIPS.Code
1 San Francisco-Oakland-Hayward, CA               6 California                1
2 San Francisco-Oakland-Hayward, CA               6 California                1
3 San Francisco-Oakland-Hayward, CA               6 California                1
4 San Francisco-Oakland-Hayward, CA               6 California                1
5 San Francisco-Oakland-Hayward, CA               6 California                1
6 San Francisco-Oakland-Hayward, CA               6 California                1
   County Site.Latitude Site.Longitude
1 Alameda      37.68753      -121.7842
2 Alameda      37.68753      -121.7842
3 Alameda      37.68753      -121.7842
4 Alameda      37.68753      -121.7842
5 Alameda      37.68753      -121.7842
6 Alameda      37.68753      -121.7842
# Last six rows of the 2002 data
tail(PM2002, n = 6L)
            Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    Units
15971 12/10/2002    AQS 61131003   1                             15 ug/m3 LC
15972 12/13/2002    AQS 61131003   1                             15 ug/m3 LC
15973 12/22/2002    AQS 61131003   1                              1 ug/m3 LC
15974 12/25/2002    AQS 61131003   1                             23 ug/m3 LC
15975 12/28/2002    AQS 61131003   1                              5 ug/m3 LC
15976 12/31/2002    AQS 61131003   1                              6 ug/m3 LC
      Daily.AQI.Value      Local.Site.Name Daily.Obs.Count Percent.Complete
15971              62 Woodland-Gibson Road               1              100
15972              62 Woodland-Gibson Road               1              100
15973               6 Woodland-Gibson Road               1              100
15974              77 Woodland-Gibson Road               1              100
15975              28 Woodland-Gibson Road               1              100
15976              33 Woodland-Gibson Road               1              100
      AQS.Parameter.Code AQS.Parameter.Description Method.Code
15971              88101  PM2.5 - Local Conditions         117
15972              88101  PM2.5 - Local Conditions         117
15973              88101  PM2.5 - Local Conditions         117
15974              88101  PM2.5 - Local Conditions         117
15975              88101  PM2.5 - Local Conditions         117
15976              88101  PM2.5 - Local Conditions         117
                         Method.Description CBSA.Code
15971 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15972 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15973 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15974 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15975 R & P Model 2000 PM2.5 Sampler w/WINS     40900
15976 R & P Model 2000 PM2.5 Sampler w/WINS     40900
                                    CBSA.Name State.FIPS.Code      State
15971 Sacramento--Roseville--Arden-Arcade, CA               6 California
15972 Sacramento--Roseville--Arden-Arcade, CA               6 California
15973 Sacramento--Roseville--Arden-Arcade, CA               6 California
15974 Sacramento--Roseville--Arden-Arcade, CA               6 California
15975 Sacramento--Roseville--Arden-Arcade, CA               6 California
15976 Sacramento--Roseville--Arden-Arcade, CA               6 California
      County.FIPS.Code County Site.Latitude Site.Longitude
15971              113   Yolo      38.66121      -121.7327
15972              113   Yolo      38.66121      -121.7327
15973              113   Yolo      38.66121      -121.7327
15974              113   Yolo      38.66121      -121.7327
15975              113   Yolo      38.66121      -121.7327
15976              113   Yolo      38.66121      -121.7327
# Variable types of the 2022 data
str(object = PM2022)
'data.frame':   59918 obs. of  22 variables:
 $ Date                          : chr  "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site.ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Daily.Mean.PM2.5.Concentration: num  12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
 $ Units                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ Daily.AQI.Value               : int  58 60 39 21 23 21 13 38 59 55 ...
 $ Local.Site.Name               : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ Daily.Obs.Count               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Percent.Complete              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS.Parameter.Code            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS.Parameter.Description     : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ Method.Code                   : int  170 170 170 170 170 170 170 170 170 170 ...
 $ Method.Description            : chr  "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" ...
 $ CBSA.Code                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA.Name                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ State.FIPS.Code               : int  6 6 6 6 6 6 6 6 6 6 ...
 $ State                         : chr  "California" "California" "California" "California" ...
 $ County.FIPS.Code              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ County                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ Site.Latitude                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ Site.Longitude                : num  -122 -122 -122 -122 -122 ...
# Variable types of the 2002 data
str(object = PM2002)
'data.frame':   15976 obs. of  22 variables:
 $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site.ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily.Mean.PM2.5.Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
 $ Units                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ Daily.AQI.Value               : int  81 93 74 82 98 115 89 62 69 107 ...
 $ Local.Site.Name               : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ Daily.Obs.Count               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Percent.Complete              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS.Parameter.Code            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS.Parameter.Description     : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ Method.Code                   : int  120 120 120 120 120 120 120 120 120 120 ...
 $ Method.Description            : chr  "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" ...
 $ CBSA.Code                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA.Name                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ State.FIPS.Code               : int  6 6 6 6 6 6 6 6 6 6 ...
 $ State                         : chr  "California" "California" "California" "California" ...
 $ County.FIPS.Code              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ County                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ Site.Latitude                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ Site.Longitude                : num  -122 -122 -122 -122 -122 ...
# Distribution of the key variable (daily mean PM2.5) in 2022
summary(PM2022[["Daily.Mean.PM2.5.Concentration"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -6.700   4.100   6.800   8.414  10.700 302.500 
# Distribution of the key variable (daily mean PM2.5) in 2002
summary(PM2002[["Daily.Mean.PM2.5.Concentration"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    7.00   12.00   16.12   20.50  104.30 

The exploratory data analysis shows that the two data sets have identical column names (the same 22 variables in each). The 2002 data has 15,976 observations while the 2022 data has 59,918 observations. The summary statistics show that the 2002 data has a mean of 16.12, median of 12, a max of 104.30, a min of 0, 1st quartile of 7, and 3rd quartile of 20.50. The 2022 data had a mean of 8.414, median of 6.8, min of -6.7, max of 302.500, 1st quartile of 4.1, and 3rd quartile of 10.7.

#Question 2
##search for how to combine data frames in r
##create new variable called 'year', 2022 for one dataset and 2002. Stratify only by the two years, and not by month and day
##New variable names, (anything used to answer the final question)
# Stack the two years into one data frame (2002 rows first, then 2022)
PM_all <- rbind(PM2002, PM2022)
# Parse the mm/dd/yyyy text into Date objects
PM_all[["Date"]] <- as.Date(PM_all[["Date"]], format = "%m/%d/%Y")
# Store the four-digit year as text in `date`; this is the variable used to
# stratify by year (note that it differs from `Date` only by case)
PM_all[["date"]] <- format(PM_all[["Date"]], "%Y")

# Keep only the rows in which both site coordinates are present
PM_all <- PM_all[complete.cases(PM_all[, c("Site.Latitude", "Site.Longitude")]), ]
#Question 3
##For leaflet, see code from lecture. subset only california
library(leaflet)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# Map the monitoring sites, coloured by year (green = 2002, red = 2022).
# PM_all holds one row per monitor per day, so it is first collapsed to one
# row per site and year. Without this step every daily record is drawn as its
# own marker on top of the previous ones, which makes the widget slow and
# heavy without adding any information to the map.
PM_all %>%
  distinct(Site.ID, date, Site.Latitude, Site.Longitude) %>%
  leaflet() %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~Site.Longitude,
    lat = ~Site.Latitude,
    # `date` stores the year as text, so compare against the string "2002"
    color = ~ifelse(date == "2002", "green", "red"),
    radius = 3,
    opacity = 0.8,
    popup = ~paste("Year:", date)
  ) %>%
  # add legend with labels and title
  addLegend(
    position = "bottomright",
    colors = c("green", "red"),
    labels = c("2002", "2022"),
    title = "Monitoring Sites"
  )
#Question 4
# Data-quality check on the combined data: for each year, count the
# observations and how many daily mean PM2.5 values are negative or missing,
# together with the corresponding proportions
data_check <- PM_all %>%
  group_by(date) %>%
  summarize(
    obs_tot = n(),
    # negative PM2.5 observations
    neg_obs = sum(Daily.Mean.PM2.5.Concentration < 0, na.rm = TRUE),
    # missing PM2.5 observations
    missing_obs = sum(is.na(Daily.Mean.PM2.5.Concentration)),
    # proportions reuse the counts computed just above
    neg_proportion = neg_obs / obs_tot,
    missing_proportion = missing_obs / obs_tot,
    # return an ungrouped table
    .groups = "drop"
  )

Question 4 creates a new table titled “data_check” which displays the negative and missing values from the daily mean PM 2.5 concentrations, stratified by year. The table shows that neither data set had missing observations and that the 2022 data had 215 negative observations, a proportion of 0.0036.

#Question 5
##3 Levels: State level across California: how have they changed? (mean median, SD),by county, then just LA- office hour notes
#summarize() and arrange() functions
library(ggplot2)
##Statewide filter
# Statewide summary: min, max, mean and median of the daily mean PM2.5
# concentration for each year
sum_statewide <- summarize(
  group_by(PM_all, date),
  PM25_state_min = min(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_state_max = max(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_state_mean = mean(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_state_median = median(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  .groups = "drop"
)
print(sum_statewide)
# A tibble: 2 × 5
  date  PM25_state_min PM25_state_max PM25_state_mean PM25_state_median
  <chr>          <dbl>          <dbl>           <dbl>             <dbl>
1 2002             0             104.           16.1               12  
2 2022            -6.7           302.            8.41               6.8
# Side-by-side box plots of the statewide daily mean PM2.5 values, one per year
ggplot(
  data = PM_all,
  mapping = aes(
    x = factor(date),
    y = Daily.Mean.PM2.5.Concentration,
    fill = factor(date)
  )
) +
  geom_boxplot(outlier.size = 0.9, outlier.alpha = 0.7) +
  labs(
    title = "2002 & 2022 California Daily Mean PM2.5 Levels ",
    x = "Year",
    y = "Daily Mean PM2.5 Levels",
    fill = "Year"
  ) +
  theme_minimal(base_size = 20)

##county
# County-level summary: min, max, mean and median of the daily mean PM2.5
# concentration for every county in each year
sum_countywide <- summarize(
  group_by(PM_all, County, date),
  PM25_county_min = min(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_county_max = max(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_county_mean = mean(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_county_median = median(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  .groups = "drop"
)
# show the county-by-year table
print(sum_countywide)
# A tibble: 98 × 6
   County       date  PM25_county_min PM25_county_max PM25_county_mean
   <chr>        <chr>           <dbl>           <dbl>            <dbl>
 1 Alameda      2002              1.9            61.6            14.3 
 2 Alameda      2022             -0.7            35.5             8.20
 3 Butte        2002              1              88              14.8 
 4 Butte        2022             -0.6            42.8             6.19
 5 Calaveras    2002              2              40               9.9 
 6 Calaveras    2022              0              25.9             6.04
 7 Colusa       2002              1              57              11.7 
 8 Colusa       2022              0.6            37               7.61
 9 Contra Costa 2002              2              76.7            15.1 
10 Contra Costa 2022              0.9            37.3             8.24
# ℹ 88 more rows
# ℹ 1 more variable: PM25_county_median <dbl>
# Ten counties with the highest mean PM2.5 within each year:
# sort by year and then by descending mean, and keep the first ten rows per year
top10_countywide_02_22 <- sum_countywide %>%
  filter(date %in% c("2002", "2022")) %>%
  arrange(date, desc(PM25_county_mean)) %>%
  group_by(date) %>%
  slice_head(n = 10) %>%
  ungroup()

print(top10_countywide_02_22)
# A tibble: 20 × 6
   County      date  PM25_county_min PM25_county_max PM25_county_mean
   <chr>       <chr>           <dbl>           <dbl>            <dbl>
 1 Kings       2002              2.8            90.7             24.7
 2 Tulare      2002              0.6            82               23.1
 3 Kern        2002              0.2           104.              23.0
 4 Riverside   2002              1.5            78.1             22.4
 5 Merced      2002              3.6            66               21.5
 6 Fresno      2002              0.1            92.5             19.9
 7 Stanislaus  2002              3              83               19.7
 8 Los Angeles 2002              0.6            72.4             19.7
 9 Orange      2002              3.8            68.6             17.8
10 San Joaquin 2002              3              64               16.8
11 Kings       2022              0.5            62.9             14.4
12 Stanislaus  2022             -0.2            53.9             12.1
13 Plumas      2022              0              89.2             11.2
14 Los Angeles 2022             -1.2            56               11.0
15 Trinity     2022             -0.6           246.              10.7
16 Tulare      2022             -2              48.3             10.4
17 Madera      2022              0.4            40.2             10.4
18 Fresno      2022             -0.3            55.8             10.2
19 Mendocino   2022             -1.3            36.2             10.1
20 Merced      2022              0.8            43.7             10.1
# ℹ 1 more variable: PM25_county_median <dbl>
# Median daily mean PM2.5 for every county and year
# (this assignment overwrites any existing sum_countywide object)
sum_countywide <- summarize(
  group_by(PM_all, County, date),
  PM25_county_median = median(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  .groups = "drop"
)
# Names of the ten counties whose largest yearly median is highest
top_counties <- sum_countywide %>%
  group_by(County) %>%
  summarize(max_median_PM25 = max(PM25_county_median, na.rm = TRUE)) %>%
  arrange(desc(max_median_PM25)) %>%
  slice_head(n = 10) %>%
  pull(County)
# Daily records for those ten counties only
PM25_10 <- filter(PM_all, County %in% top_counties)
# Box plots of the daily mean PM2.5 values for the ten counties in
# `top_counties`, one panel per county, comparing the two years.
# Each panel has its own y scale (scales = "free_y"), so box heights are not
# directly comparable between counties.
# The y axis shows the daily mean PM2.5 values themselves, so it carries the
# same label as the other box plots; the title names the median because the
# counties are selected by their yearly median.
ggplot(PM25_10, aes(x = factor(date),
                     y = Daily.Mean.PM2.5.Concentration,
                     fill = factor(date))) +
  geom_boxplot(outlier.size = 0.9, outlier.alpha = 0.7) +
  facet_wrap(~ County, scales = "free_y") +
  labs(title = "Top 10 Counties by Median Daily Mean PM2.5 in 2002 and 2022",
       x = "Year",
       y = "Daily Mean PM2.5 Levels",
       fill = "Year") +
  theme_minimal(base_size = 12)

##Los Angeles
# Records from the "Los Angeles-North Main Street" monitor for the two years
LA_sites <- PM_all %>%
  filter(
    Local.Site.Name %in% "Los Angeles-North Main Street",
    date %in% c("2002", "2022")
  )
# Mean, median, min and max of the daily mean PM2.5 at that monitor, by year
# NOTE(review): PM_25_city_median is spelled differently from the other
# PM25_city_* columns; it is kept as is so the output column names do not change
sum_citywide <- summarize(
  group_by(LA_sites, date),
  PM25_city_mean = mean(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM_25_city_median = median(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_city_min = min(Daily.Mean.PM2.5.Concentration, na.rm = TRUE),
  PM25_city_max = max(Daily.Mean.PM2.5.Concentration, na.rm = TRUE)
)

print(sum_citywide)
# A tibble: 2 × 5
  date  PM25_city_mean PM_25_city_median PM25_city_min PM25_city_max
  <chr>          <dbl>             <dbl>         <dbl>         <dbl>
1 2002            22.0              19.3           3.9          66.3
2 2022            11.6              10.9           2.4          38  
# Box plots of the daily mean PM2.5 values in LA_sites, one box per year
ggplot(
  data = LA_sites,
  mapping = aes(
    x = factor(date),
    y = Daily.Mean.PM2.5.Concentration,
    fill = factor(date)
  )
) +
  geom_boxplot(outlier.size = 0.9, outlier.alpha = 0.7) +
  labs(
    title = "Los Angeles Daily Mean PM2.5 Levels 2002 vs 2022",
    x = "Year",
    y = "Daily Mean PM2.5 Levels",
    fill = "Year"
  ) +
  theme_minimal(base_size = 14)

The statistical summaries across the state, between counties, and in the city of Los Angeles reveal multiple characteristics of the PM 2.5 levels from 2002 to 2022. At the state level, there was a decrease in daily mean PM 2.5 concentrations. At the county level, the top ten counties that had the greatest mean concentrations between the two years were Fresno, Kern, Kings, Los Angeles, Merced, Orange, Riverside, San Diego, Tulare, and Ventura. In the city of Los Angeles alone, there was a decrease in the daily mean levels of PM 2.5 from 2002 to 2022.